header <- c('entry', 'enzyme', 'Km',
'accession', 'KEGG', 'binomial',
'type', 'reference')
oxidases <- readxl::read_xlsx('HBH_table_v3.xlsx',
col_names = header,
skip = 1) %>%
mutate(across(c(entry, accession, KEGG, binomial, type), factor))
ko <- read_delim('ko.csv', ';')
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## KEGG = col_character(),
## definition = col_character()
## )
oxidases <- oxidases %>%
mutate(across(c('binomial', 'KEGG'), ~ fct_relevel(.x, sort))) %>%
mutate(genus = str_extract(binomial, '[^ ]+')) %>%
left_join(ko, 'KEGG')
taxon_levels <- c('superkingdom', 'phylum', 'class', 'order', 'family', 'genus')
genera <- oxidases %>%
pull(genus) %>%
as.character() %>%
sort() %>%
unique()
oxidases <- oxidases %>%
mutate(EC = .$definition %>%
str_extract('EC:[^\\]]+') %>%
str_remove('EC:'),
EC_f = EC %>%
str_extract('[0-9]\\.[0-9]+\\.[0-9]+'),
EC_f = ifelse(!is.na(EC_f), paste0(EC_f, '.-'), EC_f),
EC_sf = EC %>%
str_extract('[0-9]\\.[0-9]+'),
EC_sf = ifelse(!is.na(EC_sf), paste0(EC_sf, '.-.-'), EC_sf))
# taxonomy_map <- genera %>%
# purrr::map(~ myTAI::taxonomy(organism = .x, db = 'ncbi', output = 'classification'))
# saveRDS(taxonomy_map, 'taxonomy_map.rds')
taxonomy_map <- readRDS('taxonomy_map.rds')
taxonomy_data <- taxonomy_map %>%
purrr::map_dfr(~ .x %>%
filter(rank %in% taxon_levels) %>%
select(-id) %>%
pivot_wider(names_from = rank,
values_from = name)) %>%
select(all_of(taxon_levels))
oxidases <- oxidases %>%
left_join(taxonomy_data, by = 'genus')
oxidases <- oxidases %>%
mutate(subtype = type) %>%
mutate(subtype = case_when(
grepl('di[)]*oxygenase', definition, perl=T, ignore.case = T) ~ 'dioxygenase',
grepl('mono[-]*oxygenase', definition, perl=T, ignore.case = T) ~ 'monooxygenase',
grepl('hydroxylase', definition, perl=T, ignore.case = T) ~ 'hydroxylase',
type == 'terminal' ~ 'terminal',
TRUE ~ 'other',
)) %>%
dplyr::distinct()
types <- oxidases %>%
pull(type) %>%
levels()
color_map <- c('royalblue', 'goldenrod')
names(color_map) <- types
logbreaks <- c(0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0)
loglabels <- c('0.001', '0.01', '0.10', '1.0', '10.0', '100.0', '1000.0')
oxidases %>%
ggplot(aes(x = Km, color = type))+
geom_density() +
scale_x_continuous(trans = reverselog_trans(10),
breaks = logbreaks,
labels = loglabels) +
scale_color_manual(values = color_map) +
geom_vline(xintercept = 25, linetype = 'dashed') +
theme_bw()

p1 <- oxidases %>%
ggplot(aes(x = Km, y = type, fill = type, color = type )) +
geom_boxplot(outlier.shape = 21) +
scale_x_continuous(trans = reverselog_trans(10),
breaks = logbreaks,
labels = loglabels) +
geom_vline(xintercept = 25, linetype = 'dashed') +
scale_fill_manual(values = color_map,
name = 'oxidase type',
guide = guide_legend(reverse=TRUE),
aesthetics = c('color', 'fill')) +
theme_minimal() +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
# legend.title = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_text(colour="darkgrey"),
axis.ticks.x = element_line(colour="darkgrey"),
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
) +
stat_summary(geom = "crossbar",
show.legend = F,
width=0.65,
fatten=0,
color="white",
fun.data = function(x) {
return(c(y=median(x),
ymin=median(x),
ymax=median(x)))
}) +
theme(legend.position = c(0.20, 0.75))
save_plot('oxidase_boxplot_style1.png', p1)
p1.1 <- oxidases %>%
ggplot(aes(x = Km, y = type, fill = type), color = 'gray') +
geom_boxplot(outlier.shape = 21) +
scale_x_continuous(trans = reverselog_trans(10),
breaks = logbreaks,
labels = loglabels) +
geom_vline(xintercept = 25, linetype = 'dashed') +
scale_fill_manual(values = color_map,
name = 'oxidase type',
guide = guide_legend(reverse=TRUE),
aesthetics = c('fill')) +
theme_bw() +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.title.y = element_blank())
p1.2 <- p1.1 + annotate('text', x = 0.025, y = 0.5, label = 'p-value < 2e-16')
# Box cox transform and natural log transform produce the same result
oxidases.lmer <- oxidases %>%
group_by(type) %>%
mutate(Km_bc = car::bcPower(Km, 0),
Km_log = log(Km)) %>%
filter(!is.na(accession))
oxidases.lmer %>% select(Km_bc, Km_log)
## Adding missing grouping variables: `type`
## # A tibble: 333 x 3
## # Groups: type [2]
## type Km_bc Km_log
## <fct> <dbl> <dbl>
## 1 other 4.83 4.83
## 2 other 4.69 4.69
## 3 other 4.01 4.01
## 4 other 5.19 5.19
## 5 other 2.46 2.46
## 6 other 2.30 2.30
## 7 other 7.00 7.00
## 8 other 3.22 3.22
## 9 other 4.25 4.25
## 10 other 3.33 3.33
## # … with 323 more rows
oxidases.lmer %>% filter(type == 'terminal') %>%
ggplot(aes(sample = Km_bc)) + stat_qq() + stat_qq_line()

lmer_res <- lmerTest::lmer(Km_log ~ type * (1 | accession ),
oxidases.lmer,
REML = F)
## Registered S3 methods overwritten by 'lme4':
## method from
## cooks.distance.influence.merMod car
## influence.merMod car
## dfbeta.influence.merMod car
## dfbetas.influence.merMod car
lmerTest::ranova(lmer_res) # Random effect makes fit better
## ANOVA-like table for random-effects: Single term deletions
##
## Model:
## Km_log ~ type + (1 | accession)
## npar logLik AIC LRT Df Pr(>Chisq)
## <none> 4 -615.52 1239.0
## (1 | accession) 3 -666.54 1339.1 102.04 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lmer_res.summary <- lmer_res %>% summary()
lmer_res.summary # Terminal & Other are significantly different fits
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: Km_log ~ type * (1 | accession)
## Data: oxidases.lmer
##
## AIC BIC logLik deviance df.resid
## 1239.0 1254.3 -615.5 1231.0 329
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.45779 -0.50634 0.01752 0.48424 2.87124
##
## Random effects:
## Groups Name Variance Std.Dev.
## accession (Intercept) 1.751 1.323
## Residual 1.513 1.230
## Number of obs: 333, groups: accession, 126
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 4.3345 0.1697 116.9544 25.55 <2e-16 ***
## typeterminal -4.9265 0.3340 128.7363 -14.75 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## typeterminl -0.508
p1.1 %>% save_plot('oxidase_boxplot_style2.png', .)
p1.2 %>% save_plot('oxidase_boxplot_style2_pval.png', .)
p1.2 %>% save_plot('oxidase_boxplot_style2_pval.svg', .)
p_phylum <- oxidases %>% ggplot(aes(y = Km, x = phylum, fill = phylum)) +
geom_boxplot() +
facet_wrap(~type, ncol=1, scales = 'free_y') +
scale_y_continuous(trans = reverselog_trans(10),
breaks = logbreaks,
labels = loglabels) +
scale_fill_brewer(palette = 'Set1') +
coord_flip() +
cowplot::theme_cowplot()
cowplot::save_plot('boxplot_phylum.png', p_phylum, base_height=5)
message('Figure 1')
## Figure 1
p1.2

p2 <- oxidases %>%
ggplot(aes(x = Km, group = type, fill = type)) +
geom_density(alpha=.6) +
scale_x_continuous(trans = reverselog_trans(10),
breaks = logbreaks,
labels = loglabels) +
geom_vline(xintercept = 25, linetype = 'dashed') +
scale_fill_manual(values = color_map,
name = 'oxidase type',
guide = guide_legend(reverse=TRUE)) +
theme_bw() +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.title.y = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
legend.position = c(0.85,0.75),
legend.box.background = element_rect(color = 'white'),
)
p3 <- oxidases %>%
ggplot(aes(x = Km, y = type, fill = type)) +
ggridges::geom_density_ridges(alpha=.75, rel_min_height = 0.01) +
scale_x_continuous(trans = reverselog_trans(10),
breaks = logbreaks,
labels = loglabels,
expand = c(0,0)) +
scale_y_discrete(expand = c(0,0)) +
coord_cartesian(clip = 'off') +
geom_vline(xintercept = 25, linetype = 'dashed') +
scale_fill_manual(values = color_map,
name = 'oxidase type',
guide = guide_legend(reverse=TRUE)) +
ggridges::theme_ridges() +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.title.y = element_blank(),
legend.position = c(0.05,0.85),
legend.box.background = element_rect(color = 'white'))
p2 %>% save_plot('oxidase_density_plot_cb.png', .)
p3 %>% save_plot('oxidase_ridge_plot.png', .)
## Picking joint bandwidth of 0.284
p2

p3
## Picking joint bandwidth of 0.284

(oxidases %>%
ggplot(aes(y = genus, x = Km, color = type, label = KEGG, text = definition)) +
geom_point() +
scale_x_continuous(trans = reverselog_trans(10),
breaks = logbreaks,
labels = loglabels) +
geom_vline(xintercept = 25, linetype = 'dashed') +
scale_color_manual(values = color_map) +
theme_bw()) %>%
plotly::ggplotly()
oxidases %>%
filter(!is.na(superkingdom)) %>%
ggplot(aes(y = superkingdom, x = Km, fill = type)) +
geom_boxplot() +
scale_x_continuous(trans = reverselog_trans(10),
breaks = logbreaks,
labels = loglabels) +
geom_vline(xintercept = 25, linetype = 'dashed') +
scale_fill_manual(values = color_map) +
theme_bw()

oxidases %>%
filter(type == 'terminal') %>%
pull(definition) %>%
table()
## .
## cytochrome aa3-600 menaquinol oxidase subunit I [EC:7.1.1.5]
## 3
## cytochrome bd ubiquinol oxidase subunit I [EC:7.1.1.7]
## 20
## cytochrome c oxidase cbb3-type subunit I [EC:7.1.1.9]
## 11
## cytochrome c oxidase subunit 1 [EC:7.1.1.9]
## 11
## cytochrome c oxidase subunit I [EC:7.1.1.9]
## 8
## cytochrome o ubiquinol oxidase subunit I [EC:7.1.1.3]
## 21
(oxidases %>%
ggplot(aes(y = KEGG, x = Km, color = type, label = binomial, text = definition)) +
geom_point() +
scale_x_continuous(trans = reverselog_trans(10),
breaks = logbreaks,
labels = loglabels) +
geom_vline(xintercept = 25, linetype = 'dashed') +
scale_color_manual(values = color_map) +
theme_bw()) %>%
plotly::ggplotly()
oxidases %>%
filter(type == 'terminal') %>%
pull(definition) %>%
table()
## .
## cytochrome aa3-600 menaquinol oxidase subunit I [EC:7.1.1.5]
## 3
## cytochrome bd ubiquinol oxidase subunit I [EC:7.1.1.7]
## 20
## cytochrome c oxidase cbb3-type subunit I [EC:7.1.1.9]
## 11
## cytochrome c oxidase subunit 1 [EC:7.1.1.9]
## 11
## cytochrome c oxidase subunit I [EC:7.1.1.9]
## 8
## cytochrome o ubiquinol oxidase subunit I [EC:7.1.1.3]
## 21
(oxidases %>%
filter(type == 'terminal') %>%
ggplot(aes(y = definition, x = Km, color = class, label = binomial, text = definition)) +
geom_point() +
scale_x_continuous(trans = reverselog_trans(10),
breaks = logbreaks,
labels = loglabels) +
geom_vline(xintercept = 25, linetype = 'dashed') +
scale_color_brewer(type = 'qual') +
theme_bw()) %>%
plotly::ggplotly()
headers <- c('KEGG', 'gene_name', 'annotation', 'E_Score', 'P_Score')
enzclass.sf <- read_tsv('enzclass.tsv', col_names = c('EC_sf', 'Definition_sf'), skip = 1)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## EC_sf = col_character(),
## Definition_sf = col_character()
## )
enzclass.f <- read_tsv('enzclass.tsv', col_names = c('EC_f', 'Definition_f'), skip = 1)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## EC_f = col_character(),
## Definition_f = col_character()
## )
oxidases <- oxidases %>%
left_join(enzclass.sf, by = 'EC_sf') %>%
left_join(enzclass.f, by = 'EC_f')
oxidases %>%
datatable(extensions = 'Buttons',
options = list(dom = 'Blfrtip',
scrollX = TRUE,
buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
lengthMenu = list(c(10,25,50,-1),
c(10,25,50,"All"))),
rownames = FALSE)
EC_count <- oxidases %>%
pull(EC_sf) %>%
n_distinct()
(oxidases %>%
filter(type == 'other') %>%
ggplot(aes(y = KEGG, x = Km, color = EC_sf, label = binomial, text = definition)) +
geom_point() +
scale_x_continuous(trans = reverselog_trans(10),
breaks = logbreaks,
labels = loglabels) +
geom_vline(xintercept = 25, linetype = 'dashed') +
scale_color_manual(values = colorRampPalette(brewer.pal(8, 'Set1'))(EC_count),
na.value = '#000000') +
theme_bw()) %>%
plotly::ggplotly()
EC_count <- oxidases %>%
pull(EC_f) %>%
n_distinct()
oxidases %>%
select(EC_f, Definition_f) %>%
group_by(EC_f) %>%
count()
## # A tibble: 19 x 2
## # Groups: EC_f [19]
## EC_f n
## <chr> <int>
## 1 1.1.3.- 20
## 2 1.1.99.- 2
## 3 1.10.3.- 11
## 4 1.13.11.- 94
## 5 1.13.12.- 8
## 6 1.14.11.- 22
## 7 1.14.12.- 9
## 8 1.14.13.- 5
## 9 1.14.16.- 21
## 10 1.14.17.- 5
## 11 1.14.99.- 10
## 12 1.17.1.- 2
## 13 1.18.1.- 2
## 14 1.3.3.- 2
## 15 1.4.3.- 15
## 16 1.5.3.- 5
## 17 1.7.3.- 3
## 18 7.1.1.- 74
## 19 <NA> 23
(oxidases %>%
filter(type == 'other') %>%
ggplot(aes(y = KEGG, x = Km, color = EC_f, label = binomial, text = definition)) +
geom_point() +
scale_x_continuous(trans = reverselog_trans(10),
breaks = logbreaks,
labels = loglabels) +
geom_vline(xintercept = 25, linetype = 'dashed') +
scale_color_manual(values = colorRampPalette(brewer.pal(8, 'Set1'))(EC_count),
na.value = '#000000') +
theme_bw()) %>%
plotly::ggplotly()
source('./xgfs_colors.R')
oxidases %>% select(enzyme, definition, subtype)
## # A tibble: 333 x 3
## enzyme definition subtype
## <chr> <chr> <chr>
## 1 4,5-PCD (protocatechuate 4,5-di… protocatechuate 4,5-dioxygenase, … dioxygen…
## 2 Plant -dioxygenase (PADOX-1) alpha-dioxygenase [EC:1.14.99.-] dioxygen…
## 3 GenDOPa (gentisate 1,2-dioxygen… gentisate 1,2-dioxygenase [EC:1.1… dioxygen…
## 4 3,4-Dihydroxy-9,10-secoandrosta… 3,4-dihydroxy-9,10-secoandrosta-1… dioxygen…
## 5 catechol 1,2-dioxygenase catechol 1,2-dioxygenase [EC:1.13… dioxygen…
## 6 catechol 1,2-dioxygenase catechol 1,2-dioxygenase [EC:1.13… dioxygen…
## 7 OsARD1 (aci-reductone dioxygena… 1,2-dihydroxy-3-keto-5-methylthio… dioxygen…
## 8 2-monooxygenase phenol/toluene 2-monooxygenase (N… monooxyg…
## 9 2,3-dioxygenase benzene/toluene/chlorobenzene dio… dioxygen…
## 10 xylene monooxygenase p-cymene methyl-monooxygenase ele… monooxyg…
## # … with 323 more rows
subtypes <- oxidases %>%
pull(subtype) %>%
as.factor() %>%
levels()
oxidases.other <- oxidases %>%
filter(subtype != 'terminal') %>%
mutate(subtype = factor(subtype))
subtype_labels.df <- oxidases.other %>%
count(subtype) %>%
mutate(ylabels = paste0(subtype, ' (n=', n, ')'))
subtype_labels <- subtype_labels.df$ylabels
names(subtype_labels) <- subtype_labels.df$subtype
(oxidases.other %>%
ggplot(aes(y = KEGG, x = Km, color = subtype, label = binomial, text = definition)) +
geom_point() +
scale_x_continuous(trans = reverselog_trans(10),
breaks = logbreaks,
labels = loglabels) +
geom_vline(xintercept = 25, linetype = 'dashed') +
# scale_color_manual(values = xgfs.6) +
scale_color_manual(values = cb_pal.12) +
theme_bw()) %>%
plotly::ggplotly()
oxidases.other %>%
ggplot(aes(y = subtype, x = Km, fill = subtype)) +
scale_y_discrete(limits=rev(levels(oxidases.other$subtype[1:4])),
labels=subtype_labels[1:4]) +
geom_boxplot(color='black') +
scale_x_continuous(trans = reverselog_trans(10),
breaks = logbreaks,
labels = loglabels) +
geom_vline(xintercept = 25, linetype = 'dashed') +
theme_bw() +
scale_fill_manual(values = cb_pal.12)

# oxidases %>% pull(Definition_sf) %>% table()
oxidases.summary <- oxidases %>%
group_by(type) %>%
summarise(count = n(),
median = median(Km),
min = min(Km),
max = max(Km),
IQR = IQR(Km),
)
## `summarise()` ungrouping output (override with `.groups` argument)
oxidases.summary %>%
write_tsv('oxidase_summary.tsv')
oxidases.summary
## # A tibble: 2 x 6
## type count median min max IQR
## <fct> <int> <dbl> <dbl> <dbl> <dbl>
## 1 other 259 60 0.3 3600 169.
## 2 terminal 74 0.365 0.003 20.8 2.39